This document is dedicated to cleaning the data from Experiment 1.
Load and view the data:
# Read the trial-level data and the demographics for Experiment 1
d <- read.csv('../../data/dst.csv')
demo <- read.csv('../../data/demo.csv')
# Initial sample size: the number of distinct subject IDs in the raw data
n <- length(unique(d$subject))
d
Initial sample size is 104.
Subjects will be excluded for the following reasons:
# Flag subjects who reported impaired vision in the demographics survey
badSubjectsList <- demo$subject[demo$vision == 'impaired']
badSubjects <- data.frame(
  subject = badSubjectsList,
  reason = rep('Vision Impaired', length(badSubjectsList))
)
# Per-subject mean error rate
badSubjectsList <- d %>%
  group_by(subject) %>%
  summarize(error = mean(error))
# Inspect the distribution of error rates before applying a cutoff
badSubjectsList %>%
  ggplot(aes(x = error)) +
  geom_histogram(color = 'black', fill = 'light grey') +
  xlab('Error Rate') +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Flag subjects whose error rate exceeds 15%
badSubjectsList <- filter(badSubjectsList, error > .15)
badSubjects <- rbind(
  badSubjects,
  data.frame(
    subject = badSubjectsList$subject,
    reason = rep('Error rate higher than 15%', nrow(badSubjectsList))
  )
)
badSubjects
# Count distinct subjects flagged for exclusion (a subject can appear under
# more than one reason, so deduplicate before counting).
# Fix: the original indexed rows by subject ID *values*
# (badSubjects[unique(badSubjects$subject), ]), which matches against row
# names and produces NA rows — it only yielded the right count by accident
# because nrow() equals the length of the index vector.
uniqueBadSubjects <- length(unique(badSubjects$subject))
write.csv(uniqueBadSubjects, '../../data/badSubjects.csv', row.names = FALSE)
A total of 34 subjects will be dropped based on the rejection information above.
So we have a shocking number of subjects who are just making random responses. I’m making a criterion for rejecting HITs: error rates higher than 35% or mean cued RTs lower than 400 ms. These two variables clearly discriminate between people who are and aren’t doing the task correctly.
On second thought, I think we have a bot problem…
# Per-subject performance (excluding grossly long cued RTs), split by the
# 35% error-rate criterion described above. Computed once instead of running
# the identical pipeline twice.
perf <- d %>%
  filter(cuedRt < 10000) %>%
  group_by(subject) %>%
  summarize(error = mean(error), rt = mean(cuedRt))
# Good performers: at or below the criterion. Fix: the original used
# error < .35 for good and error > .35 for bad, so a subject at exactly
# .35 fell into neither group; the criterion is "higher than 35%", so
# good is the complement (<=).
good <- perf %>%
  filter(error <= .35)
# Bad performers: error rate strictly above the criterion
bad <- perf %>%
  filter(error > .35)
Good performers:
# Display the per-subject summary for subjects meeting the criterion
good
Bad Performers:
# Display the per-subject summary for subjects failing the criterion
bad
# Runs the rejection-list script, which writes a private CSV of assignments
# to reject (kept out of the public data directory). The 'Joining' messages
# below are emitted by dplyr joins inside that script.
source('../identitiesAndRejections/computeRejectList.r')
## Joining, by = "subject"
## Joining, by = "subject"
That saves to a private CSV, which I’ll use to reject assignments.
Let’s have a bit of fun with this real quick:
rejectList <- read.csv('../identitiesAndRejections/rejectList.csv')
# Per-subject error rate and mean cued RT, labeled by whether the subject
# appears on the rejection list
subjectSummary <- d %>%
  mutate(isBot = ifelse(subject %in% rejectList$subject, 'Bot', 'Human')) %>%
  filter(cuedRt < 10000) %>%
  group_by(subject) %>%
  summarize(error = mean(error), rt = mean(cuedRt), isBot = unique(isBot))
# Scatter of error rate against mean cued RT, colored by rejection status
p <- subjectSummary %>%
  ggplot(aes(x = error, y = rt, color = isBot)) +
  geom_point() +
  scale_color_manual(name = 'Turing Test', values = c(Bot = 'red', Human = 'dark green')) +
  xlab('Mean Error Rate') +
  ylab('Mean Cued Response Time (ms)') +
  theme_bw() +
  theme(legend.position = 'bottom')
# Add per-group marginal histograms to the scatter plot
ggMarginal(p = p, type = 'histogram', groupFill = TRUE)
Drop bad data
print(paste('Number of rows before removing bad subjects:', nrow(d)))
## [1] "Number of rows before removing bad subjects: 83200"
# Keep only rows from subjects not flagged for exclusion
keepRows <- !(d$subject %in% badSubjects$subject)
d <- d[keepRows, ]
print(paste('Number of rows after removing bad subjects:', nrow(d)))
## [1] "Number of rows after removing bad subjects: 56000"
# Apply the same subject filter to the demographics table
demo <- demo[!(demo$subject %in% badSubjects$subject), ]
Zoom in on everyone else:
# Distribution of error rates among the retained subjects
subjectErrors <- d %>%
  group_by(subject) %>%
  summarize(error = mean(error))
ggplot(subjectErrors, aes(x = error)) +
  geom_histogram(color = 'black', fill = 'light grey') +
  xlab('Error Rate') +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
First, dropping all trials with RT > 10 s
# Drop any trial where either response exceeded 10 s (10000 ms)
initialRows <- nrow(d)
print(paste('Number of rows before removing trials with RTs longer than 10 s:', initialRows))
## [1] "Number of rows before removing trials with RTs longer than 10 s: 56000"
d <- d %>%
  filter(cuedRt < 10000 & choiceRt < 10000)
print(paste('Number of rows after removing trials with RTs longer than 10 s:', nrow(d)))
## [1] "Number of rows after removing trials with RTs longer than 10 s: 55800"
# Record the proportion of data lost at this step
propDropped <- round(1 - (nrow(d) / initialRows), 2)
badTrials <- data.frame('Data Dropped' = propDropped, Reason = 'Trials longer than 10 s')
badTrials
Second, trials will be dropped based on subject-wise means of rts, separately for both cued and choice
## choice first
initialRows <- nrow(d)
print(paste('Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs:', initialRows))
## [1] "Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs: 55800"
# Compute each subject's mean and SD of choice RT, join them back onto the
# trial-level data, and drop trials more than two SDs from that subject's
# own mean. The join key is made explicit (by = 'subject') so dplyr does
# not have to guess it — this also silences the 'Joining, by' message.
d <- d %>%
  group_by(subject) %>%
  summarize(meanChoiceRt = mean(choiceRt), sdChoiceRt = sd(choiceRt)) %>%
  inner_join(d, by = 'subject') %>%
  mutate(badChoice = ifelse(choiceRt <= meanChoiceRt - 2 * sdChoiceRt | choiceRt > meanChoiceRt + 2 * sdChoiceRt, 1, 0)) %>%
  filter(badChoice == 0) %>%
  select(-badChoice)
# Fix: this message previously said "before" even though it reports the
# post-removal row count.
print(paste('Number of rows after removing trials where choices exceeded participant-wise choice RT cutoffs:', nrow(d)))
## [1] "Number of rows after removing trials where choices exceeded participant-wise choice RT cutoffs: 54214"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(d) / initialRows), 2), Reason = 'Choice trials with RTs more extreme than two SDs beyond participant-wise means'))
badTrials
## now for cued responses
initialRows <- nrow(d)
print(paste('Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs:', initialRows))
## [1] "Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs: 54214"
# Same two-SD, participant-wise trimming as for choice RTs, applied to the
# cued responses. The join key is explicit (by = 'subject') rather than
# inferred, which also silences the 'Joining, by' message.
d <- d %>%
  group_by(subject) %>%
  summarize(meancuedRt = mean(cuedRt), sdcuedRt = sd(cuedRt)) %>%
  inner_join(d, by = 'subject') %>%
  mutate(badcued = ifelse(cuedRt <= meancuedRt - 2 * sdcuedRt | cuedRt > meancuedRt + 2 * sdcuedRt, 1, 0)) %>%
  filter(badcued == 0) %>%
  select(-badcued)
# Fix: this message previously said "before" even though it reports the
# post-removal row count.
print(paste('Number of rows after removing trials where cued responses exceeded participant-wise cued RT cutoffs:', nrow(d)))
## [1] "Number of rows after removing trials where cued responses exceeded participant-wise cued RT cutoffs: 51942"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(d) / initialRows), 2), Reason = 'Cued trials with RTs more extreme than two SDs beyond participant-wise means'))
badTrials
Saving out a dataset for error analysis
# Snapshot that still contains error trials, for the error-rate analysis
write.csv(d, '../../data/dstCleanErrors.csv', row.names = FALSE)
Trimming out error trials and trials following error trials
I didn’t actually say I’d trim trials following error trials in the document, so I may want to revisit that decision.
initialRows <- nrow(d)
print(paste('Number of rows before removing error trials and trials following error trials :', initialRows))
## [1] "Number of rows before removing error trials and trials following error trials : 51942"
# Flag error trials and the trial immediately following an error.
# Fix 1: shift() must run within subject — ungrouped, the first trial of one
# subject was judged against the last trial of the previous subject.
# Fix 2: fill = 0 keeps each subject's first trial; shift() otherwise yields
# NA there, so ifelse() returned NA and filter() silently dropped those rows.
# NOTE(review): this assumes rows are in trial order within subject — the
# trimming at earlier steps preserves row order, but confirm the source CSV
# is ordered by trial.
d <- d %>%
  group_by(subject) %>%
  mutate(errorTrim = ifelse(error == 1 | shift(error, fill = 0) == 1, 1, 0)) %>%
  ungroup() %>%
  filter(errorTrim == 0) %>%
  select(-errorTrim)
# Fix: this message previously said "before" even though it reports the
# post-removal row count.
print(paste('Number of rows after removing error trials and trials following error trials :', nrow(d)))
## [1] "Number of rows before removing error trials and trials following error trials : 47897"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(d) / initialRows), 2), Reason = 'Trimming error trials and trials following error trials'))
badTrials
That should be good.
# Save the fully cleaned datasets. row.names = FALSE added to the demo
# write for consistency with every other write.csv in this document (the
# original omitted it there, producing a spurious leading index column).
write.csv(d, '../../data/dstClean.csv', row.names = FALSE)
write.csv(demo, '../../data/demoClean.csv', row.names = FALSE)
# Final sample size: number of distinct subjects remaining after cleaning
n <- length(unique(d$subject))
Final sample size is 70.
A work by Dave Braun
dab414@lehigh.edu